{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1cddfb2e-e508-4410-b32d-fd3452298004",
   "metadata": {},
   "source": [
    "#### Objective: Use race migration table to generate race migration points"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5b77a6ef-ef4e-45fe-8b43-6545313d4556",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "import ast,os,random\n",
    "pd.set_option('display.float_format','{:.1f}'.format)\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import cudf, cupy as cp\n",
    "import numpy as np\n",
    "import time\n",
    "import math\n",
    "import sys,os,datetime,random\n",
    "from shapely.geometry import Point\n",
    "# pd.set_option('display.max_colwidth', -1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e99e275d-c977-4392-b81a-bbb73bc5ee4f",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "66319687-f2ba-4d55-85df-83692db81f96",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID20</th>\n",
       "      <th>STATE</th>\n",
       "      <th>points</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>1</td>\n",
       "      <td>52.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10010201001001</td>\n",
       "      <td>1</td>\n",
       "      <td>34.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10010201001002</td>\n",
       "      <td>1</td>\n",
       "      <td>81.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10010201001003</td>\n",
       "      <td>1</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10010201001005</td>\n",
       "      <td>1</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID20  STATE  points\n",
       "0  10010201001000      1    52.0\n",
       "1  10010201001001      1    34.0\n",
       "2  10010201001002      1    81.0\n",
       "3  10010201001003      1    17.0\n",
       "4  10010201001005      1     8.0"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = cudf.read_csv('data/total_population_gen_df.csv').drop('Unnamed: 0',axis=1)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "97e9b49c-c118-4ca3-b973-c5f36c2499fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df = df[df.STATE==6]\n",
    "# len(df)//3\n",
    "# df= df.iloc[:len(df)//3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "c44ff46f-c2ef-4ebe-acfe-7dcc7bde6db8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "161904\n"
     ]
    }
   ],
   "source": [
    "print(len(df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7d6527f6-8214-4494-81d7-2c7c78b1f80f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def random_points_in_polygon(number, polygon):\n",
    "    # print(polygon)\n",
    "    points_x = np.array([])\n",
    "    points_y = np.array([])\n",
    "    min_x, min_y, max_x, max_y = polygon.bounds\n",
    "    i= 0\n",
    "    while i < number:\n",
    "        point_x = random.uniform(min_x, max_x)\n",
    "        point_y = random.uniform(min_y, max_y)\n",
    "        if polygon.contains(Point(point_x, point_y)):\n",
    "            points_x = np.append(points_x, point_x)\n",
    "            points_y = np.append(points_y, point_y)\n",
    "            i += 1\n",
    "    return points_x, points_y # returns list of points(lat), list of points(long)\n",
    "def generate_data(state, df_temp, gpdf):\n",
    "    t1 = datetime.datetime.now()\n",
    "    geoid_index_df = df_temp.index.to_numpy()\n",
    "    final_points_x = np.array([])\n",
    "    final_points_y = np.array([])\n",
    "    geoid = np.array([])\n",
    "    # Add additional features\n",
    "    county = np.array([])\n",
    "    p_delta = np.array([])\n",
    "    p_net = np.array([])\n",
    "    \n",
    "    \n",
    "    f=0\n",
    "    for index, row in gpdf.iterrows():\n",
    "        f+=1\n",
    "        points_x = np.array([])\n",
    "        points_y = np.array([])\n",
    "        geoid_temp = np.array([])\n",
    "        \n",
    "        if row['GEOID20'] in geoid_index_df:\n",
    "            num_points = df_temp.loc[row['GEOID20']]\n",
    "            polygon = row['geometry']\n",
    "            #print(row['GEOID10'])\n",
    "            #print('SUCCESS')\n",
    "            num_points = df_temp.loc[row['GEOID20']] # store population\n",
    "            polygon = row['geometry']\n",
    "\n",
    "      \n",
    "            if polygon is not None:\n",
    "                points_x, points_y = random_points_in_polygon(num_points, polygon)\n",
    "                # print(points_x,points_y)\n",
    "                geoid_temp = np.array([row['GEOID20']]*len(points_x))\n",
    "                geoid = np.append(geoid,geoid_temp)\n",
    "                final_points_x = np.append(final_points_x, points_x)\n",
    "                # print(final_points_x)\n",
    "                final_points_y = np.append(final_points_y, points_y)\n",
    "                print('Processing '+str(state)+' - Completed:', \"{0:0.2f}\".format((index/len(gpdf))*100), '%', end='')\n",
    "                print('', end='\\r')\n",
    "                \n",
    "            # if f==11:\n",
    "            #     break\n",
    "\n",
    "    print('Processing for '+str(state)+' complete \\n total time', datetime.datetime.now() - t1)\n",
    "    df_fin = cudf.DataFrame({'GEOID20': geoid,'x': final_points_x, 'y':final_points_y}) #,'COUNTY':county,'p_delta':p_delta,'p_net':p_net})\n",
    "    df_fin.GEOID20 = df_fin.GEOID20[1:].astype('int').astype('str')\n",
    "    df_fin.GEOID20 = df_fin.GEOID20.fillna(method='bfill')\n",
    "    \n",
    "    df_fin.to_csv('data/total_population/population_%s_1'%str(state)+'.csv', index=False)\n",
    "def exec_data(state_key_list):\n",
    "    c=0\n",
    "    for i in state_key_list:\n",
    "        print(i)\n",
    "        c+=1\n",
    "        if i< 10:\n",
    "            i_str = '0'+str(i)\n",
    "        else:\n",
    "            i_str = str(i)\n",
    "        # path = 'census_2020_data/nhgis0003_shape/nhgis0003_shapefile_tl2020_%s0_block_2020/%s_block_2020.shp'%(i_str,states[i])\n",
    "        path ='data/tl_shapefiles/tl_2021_%s_tabblock20.shp'%(i_str)\n",
    "        #print(path)\n",
    "        print(\"started reading shape file for state \", states[i])\n",
    "        if os.path.isfile(path):    \n",
    "            gpdf = gpd.read_file(path)[['GEOID20', 'geometry']].sort_values('GEOID20').reset_index(drop=True)\n",
    "            gpdf.GEOID20 = gpdf.GEOID20.astype('int64')\n",
    "            gpdf = gpdf[(gpdf.GEOID20>=480019501001000) & (gpdf.GEOID20<=481439502032029)].reset_index(drop=True)\n",
    "            print(\"completed reading shape file for state \", states[i])\n",
    "            df_temp = df.query('STATE == @i')[['ID20', 'points']]\n",
    "            df_temp.index = df_temp.ID20\n",
    "            df_temp = df_temp['points']\n",
    "            # print(gpdf.head(3))\n",
    "            # print(df_temp)\n",
    "            print(\"starting to generate data for \"+str(states[i])+\"... \")\n",
    "            generate_data(states[i], df_temp, gpdf)\n",
    "            del(df_temp)\n",
    "        else:\n",
    "            print(\"shape file does not exist\")\n",
    "            continue\n",
    "        # if c==2:\n",
    "        #     break "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "29a0e4e2-a41d-45a6-b6aa-aa8c87ddf5ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# states = {1 :\"AL\",2 :\"AK\",4 :\"AZ\",5 :\"AR\",6 :\"CA\",8 :\"CO\",9 :\"CT\",10:\"DE\",11:\"DC\",12:\"FL\",13:\"GA\",15:\"HI\",\n",
    "#           16:\"ID\",17:\"IL\",18:\"IN\",19:\"IA\",20:\"KS\",21:\"KY\",22:\"LA\",23:\"ME\",24:\"MD\",25:\"MA\",26:\"MI\",27:\"MN\",\n",
    "#           28:\"MS\",29:\"MO\",30:\"MT\",31:\"NE\",32:\"NV\",33:\"NH\",34:\"NJ\",35:\"NM\",36:\"NY\",37:\"NC\",38:\"ND\",39:\"OH\",\n",
    "#           40:\"OK\",41:\"OR\",42:\"PA\",44:\"RI\",45:\"SC\",46:\"SD\",47:\"TN\",48:\"TX\",49:\"UT\",50:\"VT\",51:\"VA\",53:\"WA\",\n",
    "#           54:\"WV\",55:\"WI\",56:\"WY\",72:\"PR\"}\n",
    "# states = {6:\"CA\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "4e6f6e62-cbeb-4a67-aee4-024ab9af2f07",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "48\n",
      "started reading shape file for state  TX\n",
      "completed reading shape file for state  TX\n",
      "starting to generate data for TX... \n",
      "Processing for TX complete 100.00 %\n",
      " total time 3:08:48.306832\n"
     ]
    }
   ],
   "source": [
    "exec_data(states.keys())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f48af1e-14fa-467b-b7b4-f773427a45dc",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Concat Parts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dd9ea4cd-2457-46cf-9824-6eac0d41975c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def merge_parts(state_key_list):\n",
    "    concat_states = cudf.DataFrame()\n",
    "    c=0\n",
    "    for i in state_key_list:\n",
    "        for c in range(1,4):\n",
    "            if i< 10:\n",
    "                i_str = '0'+str(i)\n",
    "            else:\n",
    "                i_str = str(i)\n",
    "            path = 'data/total_population/population_%s_%s'%(str(states[i]),c)+'.csv'\n",
    "            # print(path)\n",
    "            if os.path.isfile(path):    \n",
    "                temp = cudf.read_csv(path) # Load shape files\n",
    "                concat_states = cudf.concat([concat_states,temp])\n",
    "            else:\n",
    "                print(\"population file does not exist\")\n",
    "                continue\n",
    "        return concat_states"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a78e1dfc-8105-4302-bac5-f8b7c2e159de",
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_parts = merge_parts(states)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ef51cef2-4bd2-451f-a55e-26a86499fc3f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GEOID20</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>60014001001001</td>\n",
       "      <td>-122.2</td>\n",
       "      <td>37.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>60014001001001</td>\n",
       "      <td>-122.2</td>\n",
       "      <td>37.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>60014001001001</td>\n",
       "      <td>-122.2</td>\n",
       "      <td>37.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>60014001001001</td>\n",
       "      <td>-122.2</td>\n",
       "      <td>37.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>60014001001001</td>\n",
       "      <td>-122.2</td>\n",
       "      <td>37.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59325523</th>\n",
       "      <td>61150411021048</td>\n",
       "      <td>-121.3</td>\n",
       "      <td>39.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59325524</th>\n",
       "      <td>61150411021048</td>\n",
       "      <td>-121.3</td>\n",
       "      <td>39.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59325525</th>\n",
       "      <td>61150411021048</td>\n",
       "      <td>-121.3</td>\n",
       "      <td>39.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59325526</th>\n",
       "      <td>61150411021048</td>\n",
       "      <td>-121.3</td>\n",
       "      <td>39.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59325527</th>\n",
       "      <td>61150411021048</td>\n",
       "      <td>-121.3</td>\n",
       "      <td>39.4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>59325528 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 GEOID20      x    y\n",
       "0         60014001001001 -122.2 37.9\n",
       "1         60014001001001 -122.2 37.9\n",
       "2         60014001001001 -122.2 37.9\n",
       "3         60014001001001 -122.2 37.9\n",
       "4         60014001001001 -122.2 37.9\n",
       "...                  ...    ...  ...\n",
       "59325523  61150411021048 -121.3 39.4\n",
       "59325524  61150411021048 -121.3 39.4\n",
       "59325525  61150411021048 -121.3 39.4\n",
       "59325526  61150411021048 -121.3 39.4\n",
       "59325527  61150411021048 -121.3 39.4\n",
       "\n",
       "[59325528 rows x 3 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "concat_parts =concat_parts.reset_index(drop=True)\n",
    "concat_parts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d72bb78c-3a19-4a4f-8d91-6ec056169ff9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "42742567.0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df.STATE==48].points.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "80c9b8b2-4975-4508-b160-96415f5e72af",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "59325528.0"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.points.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "496e325f-c830-4e5a-bfe1-a9f8016376b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_parts.to_pandas().to_csv('data/total_population/population_CA')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13fe141e-7175-4ee8-923d-80b991dd04f5",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Concat States"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "848adcd4-502b-4296-a4cf-92e3ab9ff965",
   "metadata": {},
   "outputs": [],
   "source": [
    "def merge_shape_and_states(state_key_list):\n",
    "    concat_states = cudf.DataFrame()\n",
    "    \n",
    "    for i in state_key_list:\n",
    "        if i< 10:\n",
    "            i_str = '0'+str(i)\n",
    "        else:\n",
    "            i_str = str(i)\n",
    "        path = 'data/total_population/population_%s'%str(states[i])+'.csv'\n",
    "        if os.path.isfile(path):    \n",
    "            temp = cudf.read_csv(path) # Load shape files\n",
    "            concat_states = cudf.concat([concat_states,temp])\n",
    "        else:\n",
    "            print(i)\n",
    "            print(\"population file does not exist\")\n",
    "            continue\n",
    "        print(i)\n",
    "    return concat_states"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "50dcda12-ae59-45d1-9dc5-50e93ee5692c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# states = {1 :\"AL\",2 :\"AK\",4 :\"AZ\",5 :\"AR\",6 :\"CA\",8 :\"CO\",9 :\"CT\",10:\"DE\",11:\"DC\",12:\"FL\",13:\"GA\",15:\"HI\",\n",
    "#           16:\"ID\",17:\"IL\",18:\"IN\",19:\"IA\",20:\"KS\",21:\"KY\",22:\"LA\",23:\"ME\",24:\"MD\",25:\"MA\",26:\"MI\",27:\"MN\",\n",
    "#           28:\"MS\",29:\"MO\",30:\"MT\",31:\"NE\",32:\"NV\",33:\"NH\",34:\"NJ\",35:\"NM\",36:\"NY\",37:\"NC\",38:\"ND\",39:\"OH\",\n",
    "#           40:\"OK\",41:\"OR\",42:\"PA\",44:\"RI\",45:\"SC\",46:\"SD\",47:\"TN\",48:\"TX\",49:\"UT\",50:\"VT\",51:\"VA\",53:\"WA\",\n",
    "#           54:\"WV\",55:\"WI\",56:\"WY\",72:\"PR\"}\n",
    "states = {1 :\"AL\",2 :\"AK\",4 :\"AZ\",5 :\"AR\",6 :\"CA\",8 :\"CO\",9 :\"CT\",10:\"DE\",11:\"DC\",12:\"FL\",13:\"GA\",15:\"HI\",\n",
    "          16:\"ID\",17:\"IL\",18:\"IN\",19:\"IA\",20:\"KS\",21:\"KY\",22:\"LA\",23:\"ME\",24:\"MD\",25:\"MA\",26:\"MI\",27:\"MN\",\n",
    "          28:\"MS\"} # part1\n",
    "states = {29:\"MO\",30:\"MT\",31:\"NE\",32:\"NV\",33:\"NH\",34:\"NJ\",35:\"NM\",36:\"NY\",37:\"NC\",38:\"ND\",39:\"OH\",\n",
    "          40:\"OK\",41:\"OR\",42:\"PA\",44:\"RI\",45:\"SC\",46:\"SD\",47:\"TN\",48:\"TX\",49:\"UT\",50:\"VT\",51:\"VA\",53:\"WA\",\n",
    "          54:\"WV\",55:\"WI\",56:\"WY\",72:\"PR\"} #part2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "438ad7af-c7a4-4d39-9741-a4cc0c87859c",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "29\n",
      "30\n",
      "31\n",
      "32\n",
      "33\n",
      "34\n",
      "35\n",
      "36\n",
      "37\n",
      "38\n",
      "39\n",
      "40\n",
      "41\n",
      "42\n",
      "44\n",
      "45\n",
      "46\n",
      "47\n",
      "48\n",
      "49\n",
      "50\n",
      "51\n",
      "53\n",
      "54\n",
      "55\n",
      "56\n",
      "72\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID20</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>290019501001000</td>\n",
       "      <td>-92.4</td>\n",
       "      <td>40.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>290019501001000</td>\n",
       "      <td>-92.4</td>\n",
       "      <td>40.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>290019501001001</td>\n",
       "      <td>-92.4</td>\n",
       "      <td>40.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>290019501001001</td>\n",
       "      <td>-92.4</td>\n",
       "      <td>40.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>290019501001001</td>\n",
       "      <td>-92.4</td>\n",
       "      <td>40.3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              ID20     x    y\n",
       "0  290019501001000 -92.4 40.3\n",
       "1  290019501001000 -92.4 40.3\n",
       "2  290019501001001 -92.4 40.3\n",
       "3  290019501001001 -92.4 40.3\n",
       "4  290019501001001 -92.4 40.3"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "indv_df = merge_shape_and_states(states.keys()).drop('Unnamed: 0',axis=1)\n",
    "indv_df.rename(columns={'GEOID20':'ID20'},inplace=True)\n",
    "indv_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9fd6b2fe-5fbf-47b1-93d2-7bd18dfeeb7b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "248001113"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(indv_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bfb9e3e-bfeb-45d6-ac7a-9475e2575577",
   "metadata": {},
   "outputs": [],
   "source": [
    "# indv_df.to_pandas().to_parquet('data/total_part1.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9c369907-9c5f-4874-9d2f-8c80a86b9c56",
   "metadata": {},
   "outputs": [],
   "source": [
    "# indv_df.to_pandas().to_parquet('data/total_part2.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9d138976-2169-4049-9c7c-815657a2b08c",
   "metadata": {},
   "source": [
    "### Use processed dfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "add2e952-e678-434c-84bc-68c3d6527b5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df1 = pd.read_parquet('data/total_part1.parquet')\n",
    "# df2 = pd.read_parquet('data/total_part2.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b9c7d374-9466-4e9e-bd74-ed84d0b25790",
   "metadata": {},
   "outputs": [],
   "source": [
    "# merged = pd.concat([df1,df2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ea1d033d-dd0b-4827-b87b-623d11f02c6a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "504475979"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# len(merged)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5f3ef3c3-0170-4d7a-be20-f85b7250f2a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# gpu = cudf.from_pandas(merged)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "68e69c2c-286f-4abd-a0fb-ef11e4c4b851",
   "metadata": {},
   "outputs": [],
   "source": [
    "# merged.to_parquet('data/total_parts_combined.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab83595d-dd0f-4e05-a587-49dbcee0b31c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset = indv_df.merge(df,on='ID20',how='left').sort_values('ID20')\n",
    "# dataset.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
